library(topicmodels)
library(tidyverse)
library(tidygraph)
library(ggraph)
library(arrow)
library(tidytext)
gtr_desc = read_parquet("data/gtr.parquet") |>
select(id, abstractText)
gtr_meta = read_csv("data/projectsearch-1709481069771.csv") |>
mutate(across(ends_with("Date"), ~as.Date(.x,"%d/%m/%Y"))) |>
rename(id=ProjectId)
PARTIAL_PTTN='(/[1-9])$'
## derive helper columns: a reference ending in "/2" is part 2 of its base reference
gtr_pj = gtr_meta |>
mutate(
is_partial = str_detect(ProjectReference, PARTIAL_PTTN),
project_ref = str_replace(ProjectReference,PARTIAL_PTTN,""),
part = str_extract(ProjectReference, PARTIAL_PTTN) |>
str_extract("\\d+") |>
as.numeric() |>
coalesce(0)
) |>
# filter(is_partial) |>
group_by(project_ref) |>
mutate(occurrence = n()) |>
ungroup() |>
dplyr::relocate(ProjectReference, FundingOrgName, LeadROName, id,
is_partial,project_ref,part,occurrence)
## stop early if these assumptions no longer hold
stopifnot(
"Project Reference is NOT Unique!"=length(unique(gtr_pj$ProjectReference)) == nrow(gtr_pj),
"Project Refrence contain NA!"=!any(is.na(gtr_pj$ProjectReference))
)
## keep only the first row for each base project reference
unique_prj = gtr_pj |>
relocate(ProjectReference, project_ref, id) |>
group_by(project_ref) |>
mutate(rn=row_number()) |>
filter(rn==1) |>
select(-rn) |>
ungroup()
## find which abstract texts are repeated across projects (beyond continuation parts)
repeated_text = gtr_desc |>
group_by(abstractText) |>
mutate(n=n()) |>
filter(n!=1) |>
arrange(abstractText)
## Take the repeated texts out for now.
analysis_prj = unique_prj |>
anti_join(repeated_text, by="id") |>
mutate(year = lubridate::year(StartDate)) |>
inner_join(gtr_desc, by="id")
## break into bi-grams
abstract_words = analysis_prj |>
unnest_tokens(word, abstractText, "ngrams",n=2, drop=T) |>
count(word,id, sort=T)
## try this `bind_tf_idf` function
word_distinct = abstract_words |>
bind_tf_idf(word,id, n)
## tf-idf is good for scoping out the words that make a document distinctive
word_distinct |>
arrange(desc(tf_idf))

Research Grant and Studentship projects have “rich” abstracts we can extract from. You probably want to filter down to Research Grant before modeling.
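To check that claim quickly, a tally along these lines (not in the original code; `ProjectCategory` comes from the metadata joined in earlier) is enough:
## rough check: which categories dominate, and how long are their abstracts?
analysis_prj |>
  mutate(abstract_len = str_length(abstractText)) |>
  group_by(ProjectCategory) |>
  summarise(n_projects = n(), median_len = median(abstract_len, na.rm=T)) |>
  arrange(desc(n_projects))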
## `abstract_words` already holds one row per (word, id) pair with its count `n`;
## counting again would flatten every count to 1
abstract_word_count = abstract_words
## this is a very expensive step, so cache the result to save rendering time
if(interactive()) {
gtr_dtm = abstract_word_count |>
cast_dtm(id,word,n)
topics = gtr_dtm |> topicmodels::LDA(k=10)
saveRDS(topics, "cache/05-topics.rds")
} else {
topics=readRDS("cache/05-topics.rds")
}
topics |>
tidy("beta") |>
group_by(topic) |>
slice_max(beta,n=15) |>
ggplot(aes(y=reorder_within(term, beta,topic),x= beta )) +
geom_col() +
scale_y_reordered() +
facet_wrap(~ topic,scales="free_y") +
theme_minimal() +
xlab("beta (higher indicate more relevant to topic)") +
ylab("term") +
ggtitle("LDA model output result (beta)")
This is a typical graph used to visualize LDA output: beta indicates how important a word is to the extracted topic.
However, it is still not easy to tell what these topics actually are. We can use n-grams to link the most frequently co-occurring words, which gives us a picture of how these key words link together.
## gamma score over the years
## top gamma's n-gram
topic_binded = topics |>
tidy("gamma") |>
filter(gamma > 0.8) |> # above gamma = 0.8 we can be fairly confident a document
                       # belongs to a single topic
rename(id=document) |>
left_join(analysis_prj, "id")
topic_binded |>
select(topic, ProjectCategory) |>
mutate(topic_chr=fct_inorder(as.character(topic)) ) |>
complete(topic_chr,ProjectCategory) |>
ggplot() +
geom_bin2d(aes(x=topic_chr, y = ProjectCategory)) +
theme_minimal() +
scale_fill_viridis_c(option = "F", transform = "sqrt") +
ggtitle("Topic Against Project Category") +
coord_equal()
## For compute bigram ---------------------------------------------
#' Calculate bigrams for a document table and text field
#' @param doc_df a (possibly grouped) data frame of documents
#' @param field the column holding the text to tokenize
compute_bi_gram = function(doc_df,field) {
.gvars = group_vars(doc_df)
if(length(.gvars)==0) {
Gvars=quo(NULL)
} else {
Gvars=quo({any_of(.gvars)})
}
doc_df |>
# select({{field}},!!Gvars ) |>
unnest_tokens(phrase,{{field}},'ngrams',n=2) |>
separate(phrase, into=c("word1","word2"),sep=" ") |>
## clear up stopwords
anti_join(stop_words, c("word1"="word")) |>
anti_join(stop_words, c("word2"="word")) |>
filter(!is.na(word1) & !is.na(word2))
}
#' custom counting function that also respects any existing grouping
count_bi_gram = function(bi_gram,...) {
bi_gram |>
group_by(word1,word2, .add=T) |>
group_by(..., .add=T) |>
summarise(n=n(),.groups="drop") |>
arrange(desc(n))
}
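As a quick usage sketch, mirroring the calls made further down:
## most common bi-grams per topic for the documents bound to topics
topic_binded |>
  compute_bi_gram(abstractText) |>
  count_bi_gram(topic) |>
  head(10)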
## plot functions --------------------------------
plot_beta=function(lda_model,ki,max_n=15) {
lda_model |>
tidy("beta") |>
filter(topic == ki) |>
group_by(topic) |>
slice_max(beta,n=max_n) |>
ggplot(aes(y=reorder_within(term, beta,topic),x= beta )) +
geom_col(fill="royalblue3") +
scale_y_reordered() +
theme_minimal()
}
plot_word_graph=function(bi_gram_tokens,
ki,
top_n=15,
model=NULL,
color= "cyan4"
) {
if(!is.null(model)) {
beta=tidy(model,"beta") |>
filter(topic == ki)
.add_beta = function(x) {
activate(x,nodes) |>
left_join(beta, by=c("name"="term"))
}
.add_node_marker = function() {
geom_node_point(aes(size=beta))
}
} else {
.add_beta = \(x) x
.add_node_marker = \() geom_node_point(size=5)
}
typical_graph=bi_gram_tokens |>
count_bi_gram(topic) |>
relocate(word1,word2) |>
as_tbl_graph()
## filter bi-gram graph and plot
g = typical_graph |>
.add_beta() |>
activate(edges) |>
filter(topic == ki) |>
arrange(desc(n)) |>
filter(row_number() <= top_n) |>
activate(nodes) |>
filter(!node_is_isolated())
## plot one single graph
g |>
ggraph('kk') +
geom_edge_link(aes(width=n, alpha=n),color= color) +
geom_node_text(aes(label=name),repel = T) +
.add_node_marker() +
theme_void() +
ggtitle(paste("topic",ki))
}
## data ----------------------
## bind topic result
topic_binded=topics |>
tidy("gamma") |>
filter(gamma > 0.8) |> # above gamma = 0.8 we can be fairly confident a document
                       # belongs to a single topic
rename(id=document) |>
left_join(analysis_prj, "id")
typical_docs = topic_binded |>
group_by(topic) |>
slice_max(gamma,n=1) |>
select(topic,abstractText)
## graphical exploration
## compute the bi-grams; this is usually the expensive step
bi_gram_tok = topic_binded |>
compute_bi_gram(abstractText)
## graph -----------------------
plot_topics = function(model,reps_tk,reps_docs,topic_ki,color= "cyan4") {
## topic graph
g1 = reps_tk |>
plot_word_graph(topic_ki,top_n = 20,model=model,color=color) +
ggtitle(paste0("Topic", topic_ki,""))
## beta plot
g2 = model |>
plot_beta(topic_ki,max_n = 20) +
ggtitle("Beta, words most represent topics")
## add example texts
foot_notes = reps_docs |>
filter(topic == topic_ki) |>
pull(abstractText) |>
stringr::str_wrap(160) |>
stringr::str_trunc(500)
g3 = ggplot() + theme_void() + geom_text(aes(x=0,y=0,label = foot_notes)) +
ggtitle("Example")
ggpubr::ggarrange(g1,g2) |>
ggpubr::ggarrange(g3,nrow=2, heights=c(8,5))
}
The topic model is in fact picking up different types of topics. In particular, “Studentship” is extracted as topic 2, and “Collaborative R & D” is almost exclusively extracted as topic 10.
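For reference, one call like the following (topic number picked for illustration) produces the combined figure for a topic:
## word graph + beta bars + example abstract for topic 2
plot_topics(topics, bi_gram_tok, typical_docs, topic_ki = 2)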
Topic 3 seems to describe “machine learning”, one of the hot words, so it is no surprise that it is somehow related to studentships.
Topics 8 and 9 are both very scientific: as we can see, the descriptions are technical and specific to a particular field.

Topic 8 reads like biology; topic 9 reads like materials science.
It is also interesting to see that although the term “artificial intelligence” is frequently linked together, its beta score is actually low.
The histogram shows that Research Grant is the biggest chunk, so let’s model “Research Grant” on its own to see if there is anything interesting.
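The chunk that fits this second model is not shown in the post; a minimal sketch, assuming the name `res_lda` and reusing k = 10, might look like this:
## refit LDA on Research Grant projects only (`res_lda` is assumed by the code below)
res_lda = analysis_prj |>
  filter(ProjectCategory == "Research Grant") |>
  unnest_tokens(word, abstractText, "ngrams", n=2) |>
  filter(!is.na(word)) |>
  count(id, word) |>
  cast_dtm(id, word, n) |>
  topicmodels::LDA(k=10)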
res_reps = res_lda |>
tidy("gamma") |>
filter(gamma > 0.75) |> # above gamma = 0.75 we can be fairly confident a document
                        # belongs to a single topic
rename(id=document) |>
left_join(analysis_prj, "id")
## compute three data sets
res_betas = res_lda |> tidy("beta")
res_bi_gram = res_reps |>
compute_bi_gram(abstractText)
res_reps_top = res_reps |>
group_by(topic) |>
slice_max(gamma,n=1)
Just by looking at the departments attached to each research topic, it seems some of the topics are perhaps not as coherent as we hoped. It is possible that we need fewer than 10 topics; one quick check is sketched below.
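One standard check, not done in this post, is to compare fit across candidate values of k with `topicmodels::perplexity`; a minimal sketch, assuming the `gtr_dtm` built earlier:
## compare a few candidate topic counts; lower perplexity is better
## (expensive: subsample the DTM or cache the fits in practice)
ks = c(4, 6, 8, 10)
fits = lapply(ks, \(k) topicmodels::LDA(gtr_dtm, k=k))
data.frame(k = ks, perplexity = sapply(fits, topicmodels::perplexity))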
Let’s draw another sample.
So actually this may be doing all right. Let’s analyze the department fields.
res_reps |>
select(topic,LeadROName,Department,Title,id) |>
unnest_tokens(word,Department) |>
anti_join(stop_words,"word") |>
# filter(!word %in% c("sch","school",'science')) |>
count(id,word,topic,sort=T) |>
bind_tf_idf(word,id,n) |>
group_by(topic) |>
slice_max(tf_idf,n=10) |>
ggplot(aes(x = tf_idf, y = reorder_within(word, tf_idf, topic) )) +
geom_col(fill="black") +
scale_y_reordered() +
facet_wrap(~ topic,scales="free_y") +
theme_minimal() +
ggtitle("Topic Versus Research Institution Department")
So 5 and 6 could be similar, and 3 and 4 could stand for one topic, but we cannot be too sure yet.
Interestingly, climate change is no longer on this list. For convenience, we put topics 1, 7, and 8 all under one umbrella, “AI & Public Welfare”.
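The chunk that builds `topic_by_year` is not shown. A plausible sketch follows; the `AwardPounds` column name, basing it on `res_reps`, and the gamma-weighting are all assumptions, and the umbrella relabelling of topics 1, 7, 8 is omitted:
## yearly "topic value": gamma-weighted award per topic per start year
## (result stays grouped by topic_chr, which the later ungroup() expects)
topic_by_year = res_reps |>
  mutate(topic_chr = as.character(topic)) |>
  group_by(topic_chr, year) |>
  summarise(topic_value = sum(gamma * AwardPounds, na.rm=T))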
Cutoff_Year=2010
fast_growing = topic_by_year |>
filter(year >= Cutoff_Year) |>
group_by(topic_chr) |>
arrange(topic_chr,year) |>
mutate(growth = (topic_value - lag(topic_value))/lag(topic_value) ) |>
summarise(avg_growth = mean(growth,na.rm=T)) |>
slice_max(avg_growth, n=2)
latest_top = topic_by_year |>
slice_max(year,n=1) |>
ungroup() |>
slice_max(topic_value,n=1)
topic_by_year |>
ggplot(aes(x=year,y=topic_value)) +
geom_line(aes(color=topic_chr)) +
theme_minimal() +
ylab("topic multiply award") +
gghighlight::gghighlight(
( topic_chr %in% fast_growing$topic_chr
| topic_chr %in% latest_top$topic_chr )
& year >= Cutoff_Year,
line_label_type="sec_axis"
) +
scale_color_brewer(palette="Set2") +
geom_vline(aes(xintercept=Cutoff_Year),linetype="dashed") +
ggtitle(sprintf("Growing Topics Since %s", Cutoff_Year))

I find the bi-gram graph complements the traditional beta bar chart: instead of “reading tea leaves”, you can try reading along the edges.
bi_gram = topic_binded |>
select(id,abstractText,topic) |>
unnest_tokens(phrase,abstractText,'ngrams',n=2) |>
separate(phrase, into=c("word1","word2"),sep=" ") |>
anti_join(stop_words, c("word1"="word")) |>
anti_join(stop_words, c("word2"="word")) |>
count(topic,word1,word2,sort=T)
typical_graph = bi_gram |>
filter(!is.na(word1) & !is.na(word2)) |>
group_by(topic) |>
slice_max(n,n=20) |>
relocate(word1,word2) |>
as_tbl_graph()
G = list()
for (i in c(2,10,8,9)) {
g = typical_graph |>
activate(edges) |>
filter(topic == i) |>
activate(nodes) |>
filter(!node_is_isolated())
if(igraph::gorder(g)==0) next
plot_g = g |>
ggraph('kk') +
geom_edge_link(aes(width=n, alpha=n),color="royalblue") +
geom_node_text(aes(label=name),repel = T) +
geom_node_point(size=5) +
theme_void() +
ggtitle(paste("topic",i))
G = append(G, list(plot_g))
}
ggpubr::ggarrange(plotlist=G,ncol=2, nrow=2)
The topic read